{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "99a4a276614c4abdb7af5039a0d56241",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "tokenizer_config.json:   0%|          | 0.00/1.86k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "fbc2b2ee9cc64615ae8ba766281f1643",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "config.json:   0%|          | 0.00/605 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "3c151eb7db834aa299a7ff7f0847f4b6",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0e36ae1d6fd14d4a9fa258bc94af4ced",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565\n"
     ]
    }
   ],
   "source": [
    "from transformers import GPT2Tokenizer\n",
    "from transformers import (\n",
    "    AutoTokenizer,\n",
    "    T5ForConditionalGeneration,\n",
    "    AutoConfig,\n",
    ")\n",
    "from tokenizers import AddedToken\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained('malaysia-ai/bpe-tokenizer')\n",
    "tokenizer_t5 = AutoTokenizer.from_pretrained('google/t5-v1_1-base')\n",
    "additional = []\n",
    "for t in tokenizer_t5.additional_special_tokens:\n",
    "    additional.append(AddedToken(t))\n",
    "tokenizer.add_special_tokens({\"additional_special_tokens\": additional})\n",
    "tokenizer.pad_token = '<s>'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "PreTrainedTokenizerFast(name_or_path='malaysia-ai/bpe-tokenizer', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<s>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<extra_id_43>', '<extra_id_44>', '<extra_id_45>', '<extra_id_46>', '<extra_id_47>', '<extra_id_48>', '<extra_id_49>', '<extra_id_50>', '<extra_id_51>', '<extra_id_52>', '<extra_id_53>', '<extra_id_54>', '<extra_id_55>', '<extra_id_56>', '<extra_id_57>', '<extra_id_58>', '<extra_id_59>', '<extra_id_60>', '<extra_id_61>', '<extra_id_62>', '<extra_id_63>', '<extra_id_64>', '<extra_id_65>', '<extra_id_66>', '<extra_id_67>', '<extra_id_68>', '<extra_id_69>', '<extra_id_70>', '<extra_id_71>', '<extra_id_72>', '<extra_id_73>', '<extra_id_74>', '<extra_id_75>', '<extra_id_76>', '<extra_id_77>', '<extra_id_78>', '<extra_id_79>', '<extra_id_80>', '<extra_id_81>', '<extra_id_82>', '<extra_id_83>', '<extra_id_84>', '<extra_id_85>', '<extra_id_86>', '<extra_id_87>', '<extra_id_88>', '<extra_id_89>', '<extra_id_90>', '<extra_id_91>', '<extra_id_92>', '<extra_id_93>', '<extra_id_94>', '<extra_id_95>', '<extra_id_96>', '<extra_id_97>', '<extra_id_98>', '<extra_id_99>']}, clean_up_tokenization_spaces=True),  added_tokens_decoder={\n",
       "\t0: AddedToken(\"<pad>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t1: AddedToken(\"<s>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t2: AddedToken(\"</s>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t3: AddedToken(\"<unk>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32000: AddedToken(\"<extra_id_0>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32001: AddedToken(\"<extra_id_1>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32002: AddedToken(\"<extra_id_2>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32003: AddedToken(\"<extra_id_3>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32004: AddedToken(\"<extra_id_4>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32005: AddedToken(\"<extra_id_5>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32006: AddedToken(\"<extra_id_6>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32007: AddedToken(\"<extra_id_7>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32008: AddedToken(\"<extra_id_8>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32009: AddedToken(\"<extra_id_9>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32010: AddedToken(\"<extra_id_10>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32011: AddedToken(\"<extra_id_11>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32012: AddedToken(\"<extra_id_12>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32013: AddedToken(\"<extra_id_13>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32014: AddedToken(\"<extra_id_14>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32015: AddedToken(\"<extra_id_15>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32016: AddedToken(\"<extra_id_16>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32017: AddedToken(\"<extra_id_17>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32018: AddedToken(\"<extra_id_18>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32019: AddedToken(\"<extra_id_19>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32020: AddedToken(\"<extra_id_20>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32021: AddedToken(\"<extra_id_21>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32022: AddedToken(\"<extra_id_22>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32023: AddedToken(\"<extra_id_23>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32024: AddedToken(\"<extra_id_24>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32025: AddedToken(\"<extra_id_25>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32026: AddedToken(\"<extra_id_26>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32027: AddedToken(\"<extra_id_27>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32028: AddedToken(\"<extra_id_28>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32029: AddedToken(\"<extra_id_29>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32030: AddedToken(\"<extra_id_30>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32031: AddedToken(\"<extra_id_31>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32032: AddedToken(\"<extra_id_32>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32033: AddedToken(\"<extra_id_33>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32034: AddedToken(\"<extra_id_34>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32035: AddedToken(\"<extra_id_35>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32036: AddedToken(\"<extra_id_36>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32037: AddedToken(\"<extra_id_37>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32038: AddedToken(\"<extra_id_38>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32039: AddedToken(\"<extra_id_39>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32040: AddedToken(\"<extra_id_40>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32041: AddedToken(\"<extra_id_41>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32042: AddedToken(\"<extra_id_42>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32043: AddedToken(\"<extra_id_43>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32044: AddedToken(\"<extra_id_44>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32045: AddedToken(\"<extra_id_45>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32046: AddedToken(\"<extra_id_46>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32047: AddedToken(\"<extra_id_47>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32048: AddedToken(\"<extra_id_48>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32049: AddedToken(\"<extra_id_49>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32050: AddedToken(\"<extra_id_50>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32051: AddedToken(\"<extra_id_51>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32052: AddedToken(\"<extra_id_52>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32053: AddedToken(\"<extra_id_53>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32054: AddedToken(\"<extra_id_54>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32055: AddedToken(\"<extra_id_55>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32056: AddedToken(\"<extra_id_56>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32057: AddedToken(\"<extra_id_57>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32058: AddedToken(\"<extra_id_58>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32059: AddedToken(\"<extra_id_59>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32060: AddedToken(\"<extra_id_60>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32061: AddedToken(\"<extra_id_61>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32062: AddedToken(\"<extra_id_62>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32063: AddedToken(\"<extra_id_63>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32064: AddedToken(\"<extra_id_64>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32065: AddedToken(\"<extra_id_65>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32066: AddedToken(\"<extra_id_66>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32067: AddedToken(\"<extra_id_67>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32068: AddedToken(\"<extra_id_68>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32069: AddedToken(\"<extra_id_69>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32070: AddedToken(\"<extra_id_70>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32071: AddedToken(\"<extra_id_71>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32072: AddedToken(\"<extra_id_72>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32073: AddedToken(\"<extra_id_73>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32074: AddedToken(\"<extra_id_74>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32075: AddedToken(\"<extra_id_75>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32076: AddedToken(\"<extra_id_76>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32077: AddedToken(\"<extra_id_77>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32078: AddedToken(\"<extra_id_78>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32079: AddedToken(\"<extra_id_79>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32080: AddedToken(\"<extra_id_80>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32081: AddedToken(\"<extra_id_81>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32082: AddedToken(\"<extra_id_82>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32083: AddedToken(\"<extra_id_83>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32084: AddedToken(\"<extra_id_84>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32085: AddedToken(\"<extra_id_85>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32086: AddedToken(\"<extra_id_86>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32087: AddedToken(\"<extra_id_87>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32088: AddedToken(\"<extra_id_88>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32089: AddedToken(\"<extra_id_89>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32090: AddedToken(\"<extra_id_90>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32091: AddedToken(\"<extra_id_91>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32092: AddedToken(\"<extra_id_92>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32093: AddedToken(\"<extra_id_93>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32094: AddedToken(\"<extra_id_94>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32095: AddedToken(\"<extra_id_95>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32096: AddedToken(\"<extra_id_96>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32097: AddedToken(\"<extra_id_97>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32098: AddedToken(\"<extra_id_98>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t32099: AddedToken(\"<extra_id_99>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "}"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('./out-base-1.1/tokenizer_config.json',\n",
       " './out-base-1.1/special_tokens_map.json',\n",
       " './out-base-1.1/tokenizer.json')"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.save_pretrained('./out-base-1.1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "config = AutoConfig.from_pretrained(\n",
    "    'google/t5-v1_1-base'\n",
    ")\n",
    "config.dropout_rate = 0.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = T5ForConditionalGeneration(\n",
    "    config,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.save_pretrained('./out-base-1.1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
